{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AlbertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Local directory that the tokenizer and model files are exported into.\n",
    "out = 'tiny-bert-bahasa-cased'\n",
    "# exist_ok=True: do not raise if the directory is already there.\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('tiny-bert-bahasa-cased/spiece.model',\n",
       " 'tiny-bert-bahasa-cased/special_tokens_map.json',\n",
       " 'tiny-bert-bahasa-cased/added_tokens.json')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build an AlbertTokenizer from the local `sp10m.cased.bert.model` file and\n",
    "# write its files into `out` (the directory created earlier), so the save\n",
    "# target cannot drift from the directory that was actually created.\n",
    "tokenizer = AlbertTokenizer(\n",
    "    'sp10m.cased.bert.model',\n",
    "    unk_token='[UNK]',\n",
    "    pad_token='[PAD]',\n",
    "    do_lower_case=False,\n",
    ")\n",
    "tokenizer.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the tokenizer from the files saved on disk, with the same special tokens.\n",
    "tokenizer = AlbertTokenizer.from_pretrained(\n",
    "    './tiny-bert-bahasa-cased',\n",
    "    unk_token='[UNK]',\n",
    "    pad_token='[PAD]',\n",
    "    do_lower_case=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
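    "# Disabled shell command: converts the TensorFlow BERT checkpoint into a\n",
    "# PyTorch weights file. Uncomment to run.\n",
    "# !transformers-cli convert --model_type bert \\\n",
    "#   --tf_checkpoint ../bert/tiny-bert-v1/model.ckpt \\\n",
    "#   --config ../bert/tiny-bert-v1/config.json \\\n",
    "#   --pytorch_dump_output tiny-bert-bahasa-cased/pytorch_model.bin"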
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the architecture from the JSON file with `from_json_file`.\n",
    "# In transformers releases that provide `pipeline`, the first positional\n",
    "# parameter of `BertConfig(...)` is `vocab_size`, so passing the path to the\n",
    "# constructor does not read the file and leaves every other field at its default.\n",
    "config = BertConfig.from_json_file('tiny-bert-bahasa-cased-combined/config.json')\n",
    "# Explicit overrides applied on top of whatever the JSON file provides.\n",
    "config.vocab_size = 32000\n",
    "config.hidden_size = 312\n",
    "config.intermediate_size = 1200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load weights from the local checkpoint file, using `config` for the architecture.\n",
    "model = AutoModelWithLMHead.from_pretrained(\n",
    "    './tiny-bert-bahasa-cased-combined/pytorch_model.bin',\n",
    "    config=config,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Masked-token prediction pipeline built from the in-memory model and tokenizer.\n",
    "fill_mask = pipeline(\n",
    "    'fill-mask',\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan berbual[SEP]',\n",
       "  'score': 0.00015769545279908925,\n",
       "  'token': 17859},\n",
       " {'sequence': '[CLS] makan ayam dengan kembar[SEP]',\n",
       "  'score': 0.0001448775001335889,\n",
       "  'token': 8289},\n",
       " {'sequence': '[CLS] makan ayam dengan memaklumkan[SEP]',\n",
       "  'score': 0.00013484008377417922,\n",
       "  'token': 6881},\n",
       " {'sequence': '[CLS] makan ayam dengan Senarai[SEP]',\n",
       "  'score': 0.00013061291247140616,\n",
       "  'token': 11698},\n",
       " {'sequence': '[CLS] makan ayam dengan Tiga[SEP]',\n",
       "  'score': 0.00012453157978598028,\n",
       "  'token': 4232}]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Quick check: ask the pipeline to fill in the masked token at the end of the sentence.\n",
    "masked_sentence = 'makan ayam dengan [MASK]'\n",
    "fill_mask(masked_sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the model files into `out`, the export directory created earlier,\n",
    "# instead of repeating its name as a separate string literal.\n",
    "model.save_pretrained(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
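    "# Disabled shell command: uploads the exported directory with transformers-cli.\n",
    "# Uncomment to run.\n",
    "# !transformers-cli upload ./tiny-bert-bahasa-cased"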
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "025cd1c9552f476d87107897c7314229",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1023.0, style=ProgressStyle(description…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "69cf9698e0b4482ea90ab5dbfe264696",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=96362955.0, style=ProgressStyle(descrip…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the published checkpoint by its hub identifier as a bare BertModel.\n",
    "hub_model_id = 'huseinzol05/tiny-bert-bahasa-cased'\n",
    "model = BertModel.from_pretrained(hub_model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "508ef7a889c347cda9d66eabb7c77453",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778744.0, style=ProgressStyle(descripti…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7e0b590f1c574f539d6bf12ad99a8448",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=156.0, style=ProgressStyle(description_…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "50a9d715c7ab4d0cbc8ac2aa4f2a1047",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the published tokenizer by its hub identifier, with the same special tokens.\n",
    "tokenizer = AlbertTokenizer.from_pretrained(\n",
    "    'huseinzol05/tiny-bert-bahasa-cased',\n",
    "    unk_token='[UNK]',\n",
    "    pad_token='[PAD]',\n",
    "    do_lower_case=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode one sentence (special tokens included) and wrap it in a list to form a batch of one.\n",
    "token_ids = tokenizer.encode(\"husein tk suka mkan ayam\", add_special_tokens=True)\n",
    "input_ids = torch.tensor([token_ids])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[-0.0894,  0.3321, -0.2377,  ...,  0.9428,  5.1734, -0.6816],\n",
       "         [-0.1833, -0.0646, -0.3805,  ...,  0.4032,  7.5724, -0.1227],\n",
       "         [ 1.0851,  0.4930,  1.0498,  ...,  0.3571,  6.7428, -0.2703],\n",
       "         ...,\n",
       "         [ 0.1324,  0.5455, -0.4745,  ..., -0.1279,  4.4239, -0.1930],\n",
       "         [-0.2522,  0.0767, -0.0873,  ...,  0.6880,  4.6996, -0.3993],\n",
       "         [ 0.2774,  0.3506, -0.1620,  ...,  0.9974,  5.2789, -0.9876]]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inference only: run the forward pass without tracking gradients.\n",
    "with torch.no_grad():\n",
    "    model_outputs = model(input_ids)\n",
    "\n",
    "# Keep the first element of the model output and display it.\n",
    "last_hidden_states = model_outputs[0]\n",
    "last_hidden_states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan berbual[SEP]',\n",
       "  'score': 0.00015769545279908925,\n",
       "  'token': 17859},\n",
       " {'sequence': '[CLS] makan ayam dengan kembar[SEP]',\n",
       "  'score': 0.0001448775001335889,\n",
       "  'token': 8289},\n",
       " {'sequence': '[CLS] makan ayam dengan memaklumkan[SEP]',\n",
       "  'score': 0.00013484008377417922,\n",
       "  'token': 6881},\n",
       " {'sequence': '[CLS] makan ayam dengan Senarai[SEP]',\n",
       "  'score': 0.00013061291247140616,\n",
       "  'token': 11698},\n",
       " {'sequence': '[CLS] makan ayam dengan Tiga[SEP]',\n",
       "  'score': 0.00012453157978598028,\n",
       "  'token': 4232}]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload the published checkpoint with its LM head and repeat the fill-mask check.\n",
    "model = AutoModelWithLMHead.from_pretrained('huseinzol05/tiny-bert-bahasa-cased')\n",
    "fill_mask = pipeline(\n",
    "    'fill-mask',\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "fill_mask('makan ayam dengan [MASK]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
